In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
In [2]:
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn._oldcore")
In [3]:
!pip install chardet
Requirement already satisfied: chardet in c:\users\kiran\anaconda3\lib\site-packages (4.0.0)
In [7]:
import chardet

with open('spam.csv', 'rb') as f:
    result = chardet.detect(f.read())
    encoding = result['encoding']

df = pd.read_csv("spam.csv", encoding=encoding)
df.head()
Out[7]:
v1 v2 Unnamed: 2 Unnamed: 3 Unnamed: 4
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
1 ham Ok lar... Joking wif u oni... NaN NaN NaN
2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN NaN NaN
3 ham U dun say so early hor... U c already then say... NaN NaN NaN
4 ham Nah I don't think he goes to usf, he lives aro... NaN NaN NaN
In [8]:
df=df.iloc[:,[0,1]]
In [9]:
df.head()
Out[9]:
v1 v2
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
In [10]:
df.shape
Out[10]:
(5572, 2)
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
In [12]:
df.describe()
Out[12]:
v1 v2
count 5572 5572
unique 2 5169
top ham Sorry, I'll call later
freq 4825 30
In [13]:
df.rename(columns ={"v1":"output","v2":"sms"},inplace=True)
In [14]:
from sklearn.preprocessing import LabelEncoder
In [15]:
lb=LabelEncoder()
df["output"]=lb.fit_transform(df['output'])
df.head(3)
Out[15]:
output sms
0 0 Go until jurong point, crazy.. Available only ...
1 0 Ok lar... Joking wif u oni...
2 1 Free entry in 2 a wkly comp to win FA Cup fina...
In [16]:
lb.classes_
Out[16]:
array(['ham', 'spam'], dtype=object)
In [17]:
df.duplicated().sum()
Out[17]:
403
In [18]:
df.drop_duplicates(keep="first",inplace=True)
df.shape
Out[18]:
(5169, 2)
In [19]:
df['output'].value_counts()
Out[19]:
output
0    4516
1     653
Name: count, dtype: int64
In [20]:
import regex
In [21]:
df['num_characters']=df['sms'].apply(len)
In [22]:
df.head(2)
Out[22]:
output sms num_characters
0 0 Go until jurong point, crazy.. Available only ... 111
1 0 Ok lar... Joking wif u oni... 29
In [23]:
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[23]:
True
In [24]:
df['num_of_words']=df.apply(lambda row: len(nltk.word_tokenize(row['sms'])), axis=1)
df.head(2)
Out[24]:
output sms num_characters num_of_words
0 0 Go until jurong point, crazy.. Available only ... 111 24
1 0 Ok lar... Joking wif u oni... 29 8
In [25]:
df['num_of_sentences']=df.apply(lambda row: len(nltk.sent_tokenize(row['sms'])), axis=1)
df.head(2)
Out[25]:
output sms num_characters num_of_words num_of_sentences
0 0 Go until jurong point, crazy.. Available only ... 111 24 2
1 0 Ok lar... Joking wif u oni... 29 8 2
In [26]:
df[['num_characters','num_of_words','num_of_sentences']].describe()
Out[26]:
num_characters num_of_words num_of_sentences
count 5169.000000 5169.000000 5169.000000
mean 78.977945 18.455794 1.965564
std 58.236293 13.324758 1.448541
min 2.000000 1.000000 1.000000
25% 36.000000 9.000000 1.000000
50% 60.000000 15.000000 1.000000
75% 117.000000 26.000000 2.000000
max 910.000000 220.000000 38.000000
In [27]:
(df[df['output']==0][['num_characters','num_of_words','num_of_sentences']]).describe()
Out[27]:
num_characters num_of_words num_of_sentences
count 4516.000000 4516.000000 4516.000000
mean 70.459256 17.123782 1.820195
std 56.358207 13.493970 1.383657
min 2.000000 1.000000 1.000000
25% 34.000000 8.000000 1.000000
50% 52.000000 13.000000 1.000000
75% 90.000000 22.000000 2.000000
max 910.000000 220.000000 38.000000
In [28]:
(df[df['output']==1][['num_characters','num_of_words','num_of_sentences']]).describe()
Out[28]:
num_characters num_of_words num_of_sentences
count 653.000000 653.000000 653.000000
mean 137.891271 27.667688 2.970904
std 30.137753 7.008418 1.488425
min 13.000000 2.000000 1.000000
25% 132.000000 25.000000 2.000000
50% 149.000000 29.000000 3.000000
75% 157.000000 32.000000 4.000000
max 224.000000 46.000000 9.000000
In [29]:
import seaborn as sns
import matplotlib.pyplot as plt
In [30]:
plt.figure(figsize=(10,6))
sns.histplot(data=df,x='num_characters',hue='output')
plt.plot()
Out[30]:
[]
No description has been provided for this image
In [31]:
sns.pairplot(df,hue="output")
Out[31]:
<seaborn.axisgrid.PairGrid at 0x1ba9f75a790>
No description has been provided for this image
In [32]:
def lower_text(x):
  return x.lower()
In [33]:
df['sms']=df['sms'].apply(lower_text)
In [34]:
pattern = r'[^a-zA-Z0-9\s]'
df['sms'] = df['sms'].str.replace(pattern, '', regex=True)
In [35]:
from nltk.corpus import stopwords
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[35]:
True
In [36]:
def remove_stopwords(x):
  tex=[]
  for i in x.split(' '):
      if(i not in stopwords.words('english')):
        tex.append(i)
  return tex
In [37]:
df['transformed_txt']=df['sms'].apply(remove_stopwords)
In [38]:
from nltk.stem.porter import PorterStemmer
In [39]:
ps=PorterStemmer()
In [40]:
def stemming(x):
  tex=[]
  for i in x:
      tex.append(ps.stem(i))
  return " ".join(tex)
In [41]:
df['transformed_txt']=df['transformed_txt'].apply(stemming)
In [42]:
df.head()
Out[42]:
output sms num_characters num_of_words num_of_sentences transformed_txt
0 0 go until jurong point crazy available only in ... 111 24 2 go jurong point crazi avail bugi n great world...
1 0 ok lar joking wif u oni 29 8 2 ok lar joke wif u oni
2 1 free entry in 2 a wkly comp to win fa cup fina... 155 37 2 free entri 2 wkli comp win fa cup final tkt 21...
3 0 u dun say so early hor u c already then say 49 13 1 u dun say earli hor u c alreadi say
4 0 nah i dont think he goes to usf he lives aroun... 61 15 1 nah dont think goe usf live around though
In [43]:
from sklearn.feature_extraction.text import CountVectorizer
In [44]:
cv=CountVectorizer()
In [45]:
x=cv.fit_transform(df['transformed_txt']).toarray()
x
Out[45]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)
In [46]:
y=df['output']
y
Out[46]:
0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: output, Length: 5169, dtype: int32
In [47]:
from sklearn.model_selection import train_test_split
In [48]:
X_train,X_test,Y_train,Y_test=train_test_split(x,y)
In [49]:
X_train.shape
Out[49]:
(3876, 7983)
In [50]:
Y_train.shape
Out[50]:
(3876,)
In [51]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.svm import SVC
In [52]:
from sklearn.metrics import accuracy_score, precision_score
In [53]:
gnb=GaussianNB()
bnb=BernoulliNB()
In [54]:
gnb.fit(X_train,Y_train)
gnb_pred=gnb.predict(X_test)
In [55]:
print("Train Score: ",gnb.score(X_train,Y_train))
print("Test Score: ",accuracy_score(Y_test, gnb_pred))
Train Score:  0.9396284829721362
Test Score:  0.8692962103634957
In [56]:
bnb.fit(X_train,Y_train)
bnb_pred=bnb.predict(X_test)

print("Train Score: ",bnb.score(X_train,Y_train))
print("Test Score: ",accuracy_score(Y_test, bnb_pred))
Train Score:  0.9806501547987616
Test Score:  0.9667440061871616
In [ ]: